1 Example: Monthly electricity sales for Virginia

1.1 Extract data from remote database

esales <- dbGetQuery(db,'SELECT * from eia_elec_sales_va_all_m') # SQL code to retrieve data from a table in the remote database
# str(esales)
esales <- as_tibble(esales) # Convert dataframe to a 'tibble' for tidyverse work
# str(esales)
# Reference: https://arrow.apache.org/docs/r/
# if(!('arrow' %in% installed.packages())) install.packages('arrow')
library(arrow)
write_feather(esales, "esales.feather")
# Close connection -- this is good practice
dbDisconnect(db)
dbUnloadDriver(db_driver)

1.2 Briefly characterize the dataset

library(arrow)
esales <- read_feather("esales.feather")
print(esales)    # print the data as a table
summary(esales)  # compute basic summary statistics about the data
     value            date                 year          month       
 Min.   : 7153   Min.   :2001-01-01   Min.   :2001   Min.   : 1.000  
 1st Qu.: 8200   1st Qu.:2005-11-01   1st Qu.:2005   1st Qu.: 3.000  
 Median : 9019   Median :2010-09-01   Median :2010   Median : 6.000  
 Mean   : 9093   Mean   :2010-08-31   Mean   :2010   Mean   : 6.425  
 3rd Qu.: 9885   3rd Qu.:2015-07-01   3rd Qu.:2015   3rd Qu.: 9.000  
 Max.   :11724   Max.   :2020-05-01   Max.   :2020   Max.   :12.000  
# References: https://www.tidyverse.org/, https://dplyr.tidyverse.org/

esales %>%
  filter(year == 2019) %>%
  filter(value > 9000) %>%
  print()

esales %>%
  group_by(month) %>%
  summarise(mean = mean(value)) -> mean_esales_by_month

esales %>%
  mutate(sales_TWh = value/1000) %>%
  select(-value)
  
# filter(data object, condition) : syntax for filter() command

1.3 Plot the time series

#Reference: https://ggplot2.tidyverse.org/

ggplot(data=esales, aes(x=date,y=value)) + 
  geom_line() + xlab("Year") + ylab("Virginia monthly total electricity sales (GWh)")

1.4 Convert the data frame into a time series tsibble object

library(lubridate) # Make it easy to deal with dates

esales %>% filter(month==3)                   # These three lines of code
esales %>% filter(month(date)==3)             #   all do
esales %>% filter(lubridate::month(date)==3)  #   the same thing.

# We don't have to keep the 'year' and 'month' column: can recover them if needed

esales %>%
  select(date, sales_GWh = value) -> esales_tbl

print(elsales_tbl)
# install.packages("tsibble")
library(tsibble) # Reference: https://tsibble.tidyverts.org/articles/intro-tsibble.html

esales_tbl %>% as_tsibble(index = date) -> elsales_tbl_ts

print(elsales_tbl_ts)

1.5 Make some plots

1.5.1 Make a histogram of monthly sales

hist(elsales_tbl_ts$sales_GWh, breaks=40)

1.5.2 Make a seasonal plot

elsales_tbl_ts %>% 
  feasts::gg_season(sales_GWh, labels = "both") + ylab("Virginia electricity sales (GWh)")
# install.packages("feasts"), Reference: https://feasts.tidyverts.org/
library(feasts)

elsales_tbl_ts %>% 
  mutate(Month = tsibble::yearmonth(date)) %>% 
  as_tsibble(index = Month) %>%
  select(Month,sales_GWh) -> vaelsales_tbl_ts

print(vaelsales_tbl_ts)
vaelsales_tbl_ts %>% gg_season(sales_GWh, labels = "both") + ylab("Virginia electricity sales (GWh)")

vaelsales_tbl_ts %>% 
  gg_subseries(sales_GWh)

vaelsales_tbl_ts  %>% filter(month(Month) %in% c(3,6,9,12)) %>% gg_lag(sales_GWh, lags = 1:2)


vaelsales_tbl_ts  %>% filter(month(Month) == 1) %>% gg_lag(sales_GWh, lags = 1:2)

vaelsales_tbl_ts %>% ACF(sales_GWh) %>% autoplot()

# if(!('fpp3' %in% installed.packages())) install.packages('fpp3')
library(fpp3)
# decompose(vaelsales_tbl_ts)
vaelsales_tbl_ts %>%
  model(STL(sales_GWh ~ trend(window=21) + season(window='periodic'), robust = TRUE)) %>%
  components() %>%
  autoplot()

vaelsales_tbl_ts %>%
  mutate(ln_sales_GWh = log(sales_GWh)) %>%
  model(STL(ln_sales_GWh ~ trend(window=21) + season(window='periodic'),
    robust = TRUE)) %>%
  components() %>%
  autoplot()

vaelsales_tbl_ts %>%
  features(sales_GWh, feat_stl)
vaelsales_tbl_ts %>%
  features(sales_GWh, feature_set(pkgs=\feasts\))

2 Example: Australian production

# install.packages('tsibbledata')
library(tsibbledata)

aus_production

aus_production %>% gg_season(Electricity)

aus_production %>% gg_season(Beer)

3 Example: Gross Domestic Product data

3.1 Exploratory data analysis

library(tsibbledata) # Data sets package

print(global_economy)
global_economy %>% filter(Country==\Sweden\) %>% print()
# A tsibble: 58 x 9 [1Y]
# Key:       Country [1]
   Country Code   Year          GDP Growth   CPI Imports Exports Population
   <fct>   <fct> <dbl>        <dbl>  <dbl> <dbl>   <dbl>   <dbl>      <dbl>
 1 Sweden  SWE    1960 14842870293.  NA     9.21    23.4    23.0    7484656
 2 Sweden  SWE    1961 16147160123.   5.68  9.41    21.7    22.3    7519998
 3 Sweden  SWE    1962 17511477311.   4.26  9.86    21.4    21.9    7561588
 4 Sweden  SWE    1963 18954132366.   5.33 10.1     21.5    21.9    7604328
 5 Sweden  SWE    1964 21137242561.   6.82 10.5     21.9    22.3    7661354
 6 Sweden  SWE    1965 23260320646.   3.82 11.0     22.5    21.9    7733853
 7 Sweden  SWE    1966 25302033132.   2.09 11.7     21.9    21.4    7807797
 8 Sweden  SWE    1967 27463409202.   3.37 12.2     21.0    21.1    7867931
 9 Sweden  SWE    1968 29143383491.   3.64 12.5     21.6    21.6    7912273
10 Sweden  SWE    1969 31649203886.   5.01 12.8     23.0    22.8    7968072
# … with 48 more rows
global_economy %>%
  filter(Country==\Sweden\) %>%
  autoplot(GDP) +
  ggtitle(\GDP for Sweden\) + ylab(\$US billions\)

3.2 Fitting data to simple models

global_economy %>% model(trend_model = TSLM(GDP ~ trend())) -> fit
fit
fit %>% filter(Country == \Sweden\) %>% residuals()
fit %>% filter(Country == \Sweden\) %>% residuals() %>% autoplot(.resid)

3.2.1 Work with ln(GDP)

global_economy %>%
  filter(Country==\Sweden\) %>%
  autoplot(log(GDP)) +
  ggtitle(\ln(GDP) for Sweden\) + ylab(\$US billions\)

global_economy %>%
  model(trend_model = TSLM(log(GDP) ~ trend())) -> logfit
logfit %>% filter(Country == \Sweden\) %>% residuals() %>% autoplot()

global_economy %>% model(trend_model = TSLM(log(GDP) ~ log(Population))) -> fit3
fit3 %>% filter(Country == \Sweden\) %>% residuals() %>% autoplot()

4 Producing forecasts

fit %>% forecast(h = \3 years\) -> fcast3yrs

fcast3yrs
fcast3yrs %>% filter(Country == \Sweden\, Year == 2020) %>% str()
fable [1 × 5] (S3: fbl_ts/tbl_ts/tbl_df/tbl/data.frame)
 $ Country: Factor w/ 263 levels \Afghanistan\,..: 232
 $ .model : chr \trend_model\
 $ Year   : num 2020
 $ GDP    : dist [1:1] 
  ..$ 3:List of 2
  .. ..$ mu   : num 5.45e+11
  .. ..$ sigma: num 5.34e+10
  .. ..- attr(*, \class\)= chr [1:2] \dist_normal\ \dist_default\
  ..@ vars: chr \GDP\
 $ .mean  : num 5.45e+11
 - attr(*, \key\)= tibble [1 × 3] (S3: tbl_df/tbl/data.frame)
  ..$ Country: Factor w/ 263 levels \Afghanistan\,..: 232
  ..$ .model : chr \trend_model\
  ..$ .rows  : list<int> [1:1] 
  .. ..$ : int 1
  .. ..@ ptype: int(0) 
  ..- attr(*, \.drop\)= logi TRUE
 - attr(*, \index\)= chr \Year\
  ..- attr(*, \ordered\)= logi TRUE
 - attr(*, \index2\)= chr \Year\
 - attr(*, \interval\)= interval [1:1] 1Y
  ..@ .regular: logi TRUE
 - attr(*, \response\)= chr \GDP\
 - attr(*, \dist\)= chr \GDP\
 - attr(*, \model_cn\)= chr \.model\
fcast3yrs %>% 
  filter(Country==\Sweden\) %>%
  autoplot(global_economy) +
  ggtitle(\GDP for Sweden\) + ylab(\$US billions\)

4.1 Model residuals vs. forecast errors

Model residuals:

Your data: \(y_1, y_2, \ldots, y_T\)

Fitted values: \(\hat{y}_1, \hat{y}_2, \ldots, \hat{y}_T\)

Model residuals: \(e_t = y_t - \hat{y}_t\)

Forecast errors:

augment(fit)
augment(fit) %>% filter(Country == \Sweden\) %>%
  ggplot(aes(x = .resid)) +
  geom_histogram(bins = 20) +
  ggtitle(\Histogram of residuals\)

4.2 Are the model residuals auto-correlated?

augment(fit) %>% filter(Country == \Sweden\) -> augSweden

augSweden %>%
  ACF(.resid) %>%
  autoplot() + ggtitle(\ACF of residuals\)

augment(fit3) %>% filter(Country == \Sweden\) -> augSweden3

augSweden3 %>%
  ACF(.resid) %>%
  autoplot() + ggtitle(\ACF of residuals\)

5 Example: GDP, several countries

library(tsibbledata) # Data sets package

nordic <- c(\Sweden\, \Denmark\, \Norway\, \Finland\)

(global_economy %>% filter(Country %in% nordic) -> nordic_economy)
nordic_economy %>% autoplot(GDP)

fitnord <- nordic_economy %>%
  model(
    trend_model = TSLM(GDP ~ trend()),
    trend_model_ln = TSLM(log(GDP) ~ trend()),
    ets = ETS(GDP ~ trend(\A\)),
    arima = ARIMA(GDP)
  )

fitnord
fitnord %>%
  select(arima) %>%
  coef()

Denmark: ARMA(1,1)

Finland: MA(2)

Norway: MA(1)

Sweden: MA(2)

nordic_economy %>%
  model(arima_constrained = ARIMA(GDP ~ pdq(1,0,2))) %>% select(arima_constrained) %>% coef()
fitnord %>% coef() 
fitnord %>%  glance()  
fitnord %>% filter(Country == \Denmark\) %>% select(arima) %>% report()
Series: GDP 
Model: ARIMA(1,1,1) 

Coefficients:
          ar1     ma1
      -0.3898  0.7240
s.e.   0.2061  0.1434

sigma^2 estimated as 2.407e+20:  log likelihood=-1417.5
AIC=2840.99   AICc=2841.45   BIC=2847.12
fitnord %>%
  accuracy() %>%
  arrange(Country, MPE)
# ETS forecasts
USAccDeaths %>%
  ets() %>%
  forecast() %>%
  autoplot()
str(taylor)
plot(taylor)

6 References

---
title:     "Exploratory analysis of time series data: Examples"
institute: "SYS 5581 Time Series & Forecasting, Spring 2021" 
author:     "Instructor: Arthur Small"
date:       "Version of `r Sys.Date()`"
output: 
  html_notebook:
    number_sections: true
    toc: yes
    toc_depth: 4
    code_folding: show # options: show, hide
    fig_caption: yes
  # html_document:
  #       keep_md: yes
  # pdf_document: default
bibliography: /Users/Arthur/GitRepos/Teaching/time-series/tseries.bib
link-citations: yes
---

```{r set up coding environment, include=FALSE, message=FALSE}
# library(dplyr) -- don't need this if you are loading the entire 'tidyverse' suite
library(tidyverse)
library(lubridate) # For easy handling of time-indexed objects

# if(!('fpp3' %in% installed.packages())) install.packages('fpp3')
library(fpp3)
```


# Example: Monthly electricity sales for Virginia

## Extract data from remote database

```{r open connection to database, eval=FALSE, include=FALSE}
# Open connection to a remote database
# Make sure your VPN network connection is active if needed!

# if(!('RPostgreSQL' %in% installed.packages())) install.packages('RPostgreSQL')
library(RPostgreSQL)

# "my_postgres_credentials.R" contains the log-in information
source("/Users/Arthur/GitRepos/Teaching/my_postgres_db_credentials.R")

# Open connection
db_driver <- dbDriver("PostgreSQL")
db <- dbConnect(db_driver,user=user, password=password,dbname="postgres", host=host)
rm(password) 

# check the connection: If function returns value TRUE, the connection is working
dbExistsTable(db, "metadata")
```

```{r retrieve data from db, eval=FALSE, message=FALSE}
esales <- dbGetQuery(db,'SELECT * from eia_elec_sales_va_all_m') # SQL code to retrieve data from a table in the remote database
# str(esales)
esales <- as_tibble(esales) # Convert dataframe to a 'tibble' for tidyverse work
# str(esales)
```

```{r save data in Apache Arrow format, eval=FALSE}
# Reference: https://arrow.apache.org/docs/r/
# if(!('arrow' %in% installed.packages())) install.packages('arrow')
library(arrow)
write_feather(esales, "esales.feather")
```

```{r close db connection, eval=FALSE}
# Close connection -- this is good practice
dbDisconnect(db)
dbUnloadDriver(db_driver)
```

## Briefly characterize the dataset

```{r read in data}
library(arrow)
esales <- read_feather("esales.feather")
```

```{r }
print(esales)    # print the data as a table
summary(esales)  # compute basic summary statistics about the data
```

```{r use tidyverse syntax to perform some simple data manipulations}
# References: https://www.tidyverse.org/, https://dplyr.tidyverse.org/

esales %>%
  filter(year == 2019) %>%
  filter(value > 9000) %>%
  print()

esales %>%
  group_by(month) %>%
  summarise(mean = mean(value)) -> mean_esales_by_month

esales %>%
  mutate(sales_TWh = value/1000) %>%
  select(-value)
  
# filter(data object, condition) : syntax for filter() command
```

## Plot the time series

```{r use ggplot2 to generate a plot}
#Reference: https://ggplot2.tidyverse.org/

ggplot(data=esales, aes(x=date,y=value)) + 
  geom_line() + xlab("Year") + ylab("Virginia monthly total electricity sales (GWh)")
```
## Convert the data frame into a time series `tsibble` object

```{r}
library(lubridate) # Make it easy to deal with dates

esales %>% filter(month==3)                   # These three lines of code
esales %>% filter(month(date)==3)             #   all do
esales %>% filter(lubridate::month(date)==3)  #   the same thing.

# We don't have to keep the 'year' and 'month' column: can recover them if needed

esales %>%
  select(date, sales_GWh = value) -> esales_tbl

print(elsales_tbl)
```

```{r}
# install.packages("tsibble")
library(tsibble) # Reference: https://tsibble.tidyverts.org/articles/intro-tsibble.html

esales_tbl %>% as_tsibble(index = date) -> elsales_tbl_ts

print(elsales_tbl_ts)
```

## Make some plots

###  Make a histogram of monthly sales

```{r make a histogram of the data}
hist(elsales_tbl_ts$sales_GWh, breaks=40)
```

###  Make a seasonal plot

```{r, eval=FALSE}
elsales_tbl_ts %>% 
  feasts::gg_season(sales_GWh, labels = "both") + ylab("Virginia electricity sales (GWh)")
```

```{r}
# install.packages("feasts"), Reference: https://feasts.tidyverts.org/
library(feasts)

elsales_tbl_ts %>% 
  mutate(Month = tsibble::yearmonth(date)) %>% 
  as_tsibble(index = Month) %>%
  select(Month,sales_GWh) -> vaelsales_tbl_ts

print(vaelsales_tbl_ts)
```

```{r , warning=FALSE}
vaelsales_tbl_ts %>% gg_season(sales_GWh, labels = "both") + ylab("Virginia electricity sales (GWh)")
```


```{r}
vaelsales_tbl_ts %>% 
  gg_subseries(sales_GWh)
```

```{r plot lagged values}
vaelsales_tbl_ts  %>% filter(month(Month) %in% c(3,6,9,12)) %>% gg_lag(sales_GWh, lags = 1:2)

vaelsales_tbl_ts  %>% filter(month(Month) == 1) %>% gg_lag(sales_GWh, lags = 1:2)
```

```{r}
vaelsales_tbl_ts %>% ACF(sales_GWh) %>% autoplot()
```

```{r perform automated time series decomposition}


# decompose(vaelsales_tbl_ts)
```

```{r perform additive STL decomposition of the VA electricity sales time series}
vaelsales_tbl_ts %>%
  model(STL(sales_GWh ~ trend(window=21) + season(window='periodic'), robust = TRUE)) %>%
  components() %>%
  autoplot()
```

```{r perform multiplicative STL decomposition of the VA electricity sales time series}
vaelsales_tbl_ts %>%
  mutate(ln_sales_GWh = log(sales_GWh)) %>%
  model(STL(ln_sales_GWh ~ trend(window=21) + season(window='periodic'),
    robust = TRUE)) %>%
  components() %>%
  autoplot()
```

```{r}
vaelsales_tbl_ts %>%
  features(sales_GWh, feat_stl)
```

```{r}
vaelsales_tbl_ts %>%
  features(sales_GWh, feature_set(pkgs="feasts"))
```

# Example: Australian production

```{r, warning=FALSE}
# install.packages('tsibbledata')
library(tsibbledata)

aus_production

aus_production %>% gg_season(Electricity)

aus_production %>% gg_season(Beer)
```

# Example: Gross Domestic Product data

## Exploratory data analysis

```{r}
library(tsibbledata) # Data sets package

print(global_economy)
```

```{r}
global_economy %>% filter(Country=="Sweden") %>% print()
```

```{r}
global_economy %>%
  filter(Country=="Sweden") %>%
  autoplot(GDP) +
  ggtitle("GDP for Sweden") + ylab("$US billions")
```

## Fitting data to simple models

```{r}
global_economy %>% model(trend_model = TSLM(GDP ~ trend())) -> fit

fit


```

```{r}
fit %>% filter(Country == "Sweden") %>% residuals()
```

```{r}

fit %>% filter(Country == "Sweden") %>% residuals() %>% autoplot(.resid)
```

### Work with ln(GDP)

```{r}
global_economy %>%
  filter(Country=="Sweden") %>%
  autoplot(log(GDP)) +
  ggtitle("ln(GDP) for Sweden") + ylab("$US billions")
```

```{r}
global_economy %>%
  model(trend_model = TSLM(log(GDP) ~ trend())) -> logfit
```

```{r}
logfit %>% filter(Country == "Sweden") %>% residuals() %>% autoplot()
```

```{r}
global_economy %>% model(trend_model = TSLM(log(GDP) ~ log(Population))) -> fit3

fit3 %>% filter(Country == "Sweden") %>% residuals() %>% autoplot()

```

# Producing forecasts

```{r}
fit %>% forecast(h = "3 years") -> fcast3yrs

fcast3yrs

```

```{r}

fcast3yrs %>% filter(Country == "Sweden", Year == 2020) %>% str()
```

```{r visualize forecasts}
fcast3yrs %>% 
  filter(Country=="Sweden") %>%
  autoplot(global_economy) +
  ggtitle("GDP for Sweden") + ylab("$US billions")
```

## Model residuals vs. forecast errors

Model residuals:

Your data: $y_1, y_2, \ldots, y_T$

Fitted values: $\hat{y}_1, \hat{y}_2, \ldots, \hat{y}_T$

Model residuals: $e_t = y_t - \hat{y}_t$

Forecast errors:

```{r}
augment(fit)
```

```{r}
augment(fit) %>% filter(Country == "Sweden") %>%
  ggplot(aes(x = .resid)) +
  geom_histogram(bins = 20) +
  ggtitle("Histogram of residuals")
```

## Are the model residuals auto-correlated?

```{r}
augment(fit) %>% filter(Country == "Sweden") -> augSweden

augSweden %>%
  ACF(.resid) %>%
  autoplot() + ggtitle("ACF of residuals")
```

```{r}
augment(fit3) %>% filter(Country == "Sweden") -> augSweden3

augSweden3 %>%
  ACF(.resid) %>%
  autoplot() + ggtitle("ACF of residuals")
```


# Example: GDP, several countries


```{r}
library(tsibbledata) # Data sets package

nordic <- c("Sweden", "Denmark", "Norway", "Finland")

(global_economy %>% filter(Country %in% nordic) -> nordic_economy)

```

```{r}
nordic_economy %>% autoplot(GDP)
```

```{r}
fitnord <- nordic_economy %>%
  model(
    trend_model = TSLM(GDP ~ trend()),
    trend_model_ln = TSLM(log(GDP) ~ trend()),
    ets = ETS(GDP ~ trend("A")),
    arima = ARIMA(GDP)
  )

fitnord
```

```{r}
fitnord %>%
  select(arima) %>%
  coef()
```


Denmark: ARMA(1,1)

Finland: MA(2)

Norway: MA(1)

Sweden: MA(2)

```{r}
nordic_economy %>%
  model(arima_constrained = ARIMA(GDP ~ pdq(1,0,2))) %>% select(arima_constrained) %>% coef()
```

```{r}
fitnord %>% coef() 
```

```{r}
fitnord %>%  glance()  
```

```{r}
fitnord %>% filter(Country == "Denmark") %>% select(arima) %>% report()
```

```{r}
fitnord %>%
  accuracy() %>%
  arrange(Country, MPE)
```



```{r, eval=FALSE}
# ETS forecasts
USAccDeaths %>%
  ets() %>%
  forecast() %>%
  autoplot()
```

```{r, eval=FALSE}
str(taylor)
plot(taylor)
```



# References
